# Data Cleaning ----
# Load the scraped player data and screen it down to the modelling sample.
data <- read.csv(file = "final_dataset2.csv")
# Keep players with at least 30 games played (drop small-sample noise).
data <- data[data$G >= 30, ]
# Keep players aged 22+ (per the original "salary cap" note — presumably to
# exclude rookie-scale contracts that distort the salary signal; confirm).
data <- data[data$Age >= 22, ]
# Retain the row id, player, position, salary, and per-game stat columns.
data_reduced <- data[, c("X", "Player", "Pos", "salary", "Age", "FG.", "FT.",
                         "MPG", "PPG", "APG", "RPG", "TOPG", "BPG", "SPG")]
# Drop the hybrid "SF-SG" position. Use a logical mask rather than
# -c(which(...)): if no row matched, -integer(0) would silently select
# ZERO rows instead of keeping them all.
data_reduced <- data_reduced[data_reduced$Pos != "SF-SG", ]
# Split into train (80%) and test (20%) ----
# Reproducible 80/20 split: assign each row a random rank, bin the ranks into
# five equal groups, and hold out the fifth group ("test") as the test set.
set.seed(664)
shuffle <- sample.int(nrow(data_reduced))
data_reduced$split <- cut(shuffle, breaks = 5,
                          labels = c("1", "2", "3", "4", "test"))
# Materialise the two sets, dropping the helper column from both and from
# the working data frame.
train <- subset(data_reduced, split != "test", select = -split)
test <- subset(data_reduced, split == "test", select = -split)
data_reduced <- subset(data_reduced, select = -split)
# Compare candidate responses side by side: raw salary is right-skewed, so the
# log and square-root transforms are examined as alternatives.
par(mfrow = c(1, 3))
hist(train$salary, freq = FALSE, main = "Histogram of Salary", xlab = "Salary")
hist(log(train$salary), freq = FALSE, main = "Histogram of Log of Salary",
     xlab = "Log of Salary")
# xlab fixed: original read "Square Root of Salary of Salary".
hist(sqrt(train$salary), freq = FALSE,
     main = "Histogram of Square Root of Salary",
     xlab = "Square Root of Salary")
# Candidate model 1: untransformed salary on all ten predictors.
model1 = lm(salary~Age+FG.+FT.+MPG+PPG+APG+RPG+TOPG+BPG+SPG, data = train)
summary(model1)
##
## Call:
## lm(formula = salary ~ Age + FG. + FT. + MPG + PPG + APG + RPG +
## TOPG + BPG + SPG, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13227874 -3272493 -340406 2870022 15291207
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -16584115 5072428 -3.269 0.00124 **
## Age 618222 95639 6.464 5.79e-10 ***
## FG. 4734316 6456505 0.733 0.46413
## FT. -6692194 4028580 -1.661 0.09801 .
## MPG 113349 105518 1.074 0.28382
## PPG 697890 153702 4.541 8.95e-06 ***
## APG 195999 384221 0.510 0.61044
## RPG 446132 246548 1.810 0.07164 .
## TOPG -606715 1178014 -0.515 0.60701
## BPG -273372 1234923 -0.221 0.82500
## SPG 12901 1316771 0.010 0.99219
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5427000 on 236 degrees of freedom
## Multiple R-squared: 0.5351, Adjusted R-squared: 0.5154
## F-statistic: 27.16 on 10 and 236 DF, p-value: < 2.2e-16
# Residual diagnostics for the raw-salary model.
plot(model1)
# Candidate model 2: log-transformed salary (addresses right skew).
model2 = lm(log(salary)~ Age+FG.+FT.+MPG+PPG+APG+RPG+TOPG+BPG+SPG, data = train)
summary(model2)
##
## Call:
## lm(formula = log(salary) ~ Age + FG. + FT. + MPG + PPG + APG +
## RPG + TOPG + BPG + SPG, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4695 -0.4710 0.0390 0.5645 1.8255
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.87523 0.76225 12.955 < 2e-16 ***
## Age 0.10782 0.01437 7.502 1.27e-12 ***
## FG. 1.91347 0.97024 1.972 0.049759 *
## FT. -0.03295 0.60539 -0.054 0.956642
## MPG 0.05465 0.01586 3.447 0.000672 ***
## PPG 0.03611 0.02310 1.563 0.119324
## APG -0.12428 0.05774 -2.152 0.032380 *
## RPG 0.02254 0.03705 0.608 0.543603
## TOPG 0.21732 0.17702 1.228 0.220816
## BPG -0.02968 0.18558 -0.160 0.873089
## SPG 0.08239 0.19788 0.416 0.677507
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8155 on 236 degrees of freedom
## Multiple R-squared: 0.5051, Adjusted R-squared: 0.4842
## F-statistic: 24.09 on 10 and 236 DF, p-value: < 2.2e-16
# Residual diagnostics for the log-salary model.
plot(model2)
# Candidate model 3: square-root-transformed salary.
model3 = lm(sqrt(salary)~Age+FG.+FT.+MPG+PPG+APG+RPG+TOPG+BPG+SPG, data = train)
summary(model3)
##
## Call:
## lm(formula = sqrt(salary) ~ Age + FG. + FT. + MPG + PPG + APG +
## RPG + TOPG + BPG + SPG, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2191.24 -592.62 -5.53 559.79 2364.94
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2822.33 847.76 -3.329 0.00101 **
## Age 117.59 15.98 7.356 3.11e-12 ***
## FG. 1470.94 1079.09 1.363 0.17414
## FT. -683.56 673.30 -1.015 0.31103
## MPG 44.87 17.64 2.544 0.01159 *
## PPG 81.96 25.69 3.191 0.00161 **
## APG -55.33 64.22 -0.862 0.38973
## RPG 53.67 41.21 1.303 0.19400
## TOPG 67.27 196.88 0.342 0.73290
## BPG -49.00 206.39 -0.237 0.81256
## SPG 39.13 220.07 0.178 0.85901
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 907 on 236 degrees of freedom
## Multiple R-squared: 0.5447, Adjusted R-squared: 0.5254
## F-statistic: 28.24 on 10 and 236 DF, p-value: < 2.2e-16
# Residual diagnostics for the sqrt-salary model.
plot(model3)
# Looking into different positions ----
# Split the train and test sets by position. Select the Pos column by name
# rather than by position ([, 3]) so the code survives column reordering.
train_PG <- train[train$Pos == "PG", ]
train_PF <- train[train$Pos == "PF", ]
train_SF <- train[train$Pos == "SF", ]
train_C <- train[train$Pos == "C", ]
train_SG <- train[train$Pos == "SG", ]
test_PG <- test[test$Pos == "PG", ]
test_PF <- test[test$Pos == "PF", ]
test_SF <- test[test$Pos == "SF", ]
test_C <- test[test$Pos == "C", ]
test_SG <- test[test$Pos == "SG", ]
# First, the PG (point guard) group ----
library(corrplot)
## corrplot 0.92 loaded
# Pairwise correlations among the numeric columns (drop X, Player, Pos).
corrplot(cor(train_PG[,-c(1,2,3)]), method = 'number')
par(mfrow=c(2,3))
# Marginal distributions of each predictor for the PG group.
# NOTE(review): FG. / FT. look like csv-mangled "FG%" / "FT%" percentage
# columns (values in [0, 1], judging by the fitted coefficient scales), so
# they are labelled as percentages here rather than per-game counts —
# confirm against the source data. Typos fixed ("Filed", "Assits").
hist(train_PG$Age, freq = FALSE, main = "Histogram of Age", xlab = "Age")
hist(train_PG$FG., freq = FALSE, main = "Histogram of Field Goal Percentage", xlab = "Field Goal Percentage")
hist(train_PG$FT., freq = FALSE, main = "Histogram of Free Throw Percentage", xlab = "Free Throw Percentage")
hist(train_PG$MPG, freq = FALSE, main = "Histogram of Minutes Played per Game", xlab = "Minutes Played per Game")
hist(train_PG$PPG, freq = FALSE, main = "Histogram of Points per Game", xlab = "Points per Game")
hist(train_PG$APG, freq = FALSE, main = "Histogram of Assists per Game", xlab = "Assists per Game")
hist(train_PG$RPG, freq = FALSE, main = "Histogram of Rebounds per Game", xlab = "Rebounds per Game")
hist(train_PG$TOPG, freq = FALSE, main = "Histogram of Turnovers per Game", xlab = "Turnovers per Game")
hist(train_PG$BPG, freq = FALSE, main = "Histogram of Blocks per Game", xlab = "Blocks per Game")
hist(train_PG$SPG, freq = FALSE, main = "Histogram of Steals per Game", xlab = "Steals per Game")
# Full PG model: log(salary) on all ten predictors. Use log(salary) in the
# formula (not log(train_PG$salary)) so the fitted object is self-contained
# and predict() on new data is unambiguous; the fit itself is unchanged.
lm1 <- lm(log(salary) ~ Age + FG. + FT. + MPG + PPG + APG + RPG + TOPG + BPG + SPG,
          data = train_PG)
summary(lm1)
## (Echoed Call updated to match the formula; all estimates are identical.)
##
## Call:
## lm(formula = log(salary) ~ Age + FG. + FT. + MPG + PPG +
## APG + RPG + TOPG + BPG + SPG, data = train_PG)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2274 -0.3493 0.0932 0.4942 1.6099
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.158469 2.865079 2.499 0.01691 *
## Age 0.143467 0.042763 3.355 0.00181 **
## FG. -0.028013 4.246212 -0.007 0.99477
## FT. 2.315910 2.428617 0.954 0.34632
## MPG 0.056338 0.055631 1.013 0.31760
## PPG 0.021056 0.058364 0.361 0.72026
## APG -0.098764 0.163019 -0.606 0.54822
## RPG 0.005885 0.174597 0.034 0.97329
## TOPG 0.265868 0.402714 0.660 0.51311
## BPG 0.721408 1.032994 0.698 0.48920
## SPG 0.486642 0.587876 0.828 0.41295
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9683 on 38 degrees of freedom
## Multiple R-squared: 0.5554, Adjusted R-squared: 0.4384
## F-statistic: 4.747 on 10 and 38 DF, p-value: 0.0002036
# Residual diagnostics for the full PG model.
plot(lm1)
# Outlier / influence diagnostics for the full PG model (car package).
library(car)
## Loading required package: carData
# Bonferroni-adjusted test on the most extreme studentized residuals.
outlierTest(lm1)
## rstudent unadjusted p-value Bonferroni p
## 314 -4.227194 0.00014859 0.0072807
## 355 -3.960325 0.00032764 0.0160540
# Studentized residuals vs hat-values, bubble size = Cook's distance.
influencePlot(lm1)
# Refit after dropping the influential PG observations flagged above (rows
# identified by original row id X). %in% replaces the chain of != tests.
train2 <- subset(train_PG, !(X %in% c(218, 314, 355, 376)))
lm2 <- lm(log(salary) ~ Age + FG. + FT. + MPG + PPG + APG + RPG + TOPG + BPG + SPG,
          data = train2)
summary(lm2)
## (Echoed Call updated to match the formula; all estimates are identical.)
##
## Call:
## lm(formula = log(salary) ~ Age + FG. + FT. + MPG + PPG +
## APG + RPG + TOPG + BPG + SPG, data = train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.19413 -0.40517 0.04323 0.39876 1.25242
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.14791 2.11457 3.380 0.00183 **
## Age 0.10219 0.03122 3.273 0.00245 **
## FG. 1.88851 3.21857 0.587 0.56124
## FT. 3.29799 1.76777 1.866 0.07074 .
## MPG 0.06323 0.04253 1.487 0.14626
## PPG 0.02002 0.04170 0.480 0.63427
## APG -0.14174 0.12321 -1.150 0.25801
## RPG 0.09181 0.15781 0.582 0.56457
## TOPG 0.18058 0.29576 0.611 0.54555
## BPG 0.11713 1.08238 0.108 0.91446
## SPG 0.17159 0.46036 0.373 0.71167
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6609 on 34 degrees of freedom
## Multiple R-squared: 0.6704, Adjusted R-squared: 0.5735
## F-statistic: 6.916 on 10 and 34 DF, p-value: 8.64e-06
# Residual diagnostics after removing the influential points.
plot(lm2)
#AIC and Mallow's Cp select model yields the same model
# Best-subset selection over all ten predictors (leaps).
library(leaps)
b = regsubsets(log(salary)~Age+FG.+FT.+MPG+PPG+APG+RPG+TOPG+BPG+SPG, data = train2)
rs = summary(b)
# Cp vs number of parameters; points near the line Cp = p are adequate.
plot(2:9,rs$cp, ylab = "CP statistic", xlab = "Number of Parameters")
abline(0,1)
# Predictors included in the minimum-Cp model: Age, FT., MPG.
rs$which[which.min(rs$cp),]
## (Intercept) Age FG. FT. MPG PPG
## TRUE TRUE FALSE TRUE TRUE FALSE
## APG RPG TOPG BPG SPG
## FALSE FALSE FALSE FALSE FALSE
#step(lm2)
# Reduced PG model from the Cp / AIC selection.
lm3 = lm(log(salary)~Age+FT.+MPG,train2)
summary(lm3)
##
## Call:
## lm(formula = log(salary) ~ Age + FT. + MPG, data = train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.48115 -0.41848 0.08693 0.51650 1.05139
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.70850 1.14506 6.732 3.94e-08 ***
## Age 0.09637 0.02621 3.677 0.000679 ***
## FT. 3.50883 1.29060 2.719 0.009562 **
## MPG 0.08850 0.01515 5.841 7.28e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6401 on 41 degrees of freedom
## Multiple R-squared: 0.6272, Adjusted R-squared: 0.5999
## F-statistic: 22.99 on 3 and 41 DF, p-value: 6.868e-09
plot(lm3)
#no sign of heteroscedasticity
# Studentized Breusch-Pagan test with interactions and squared terms as
# auxiliary regressors; large p-value = no evidence of heteroscedasticity.
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(lm3, ~ Age*FT.*MPG + I(Age^2) + I(FT.^2) + I(MPG^2), data = train2)
##
## studentized Breusch-Pagan test
##
## data: lm3
## BP = 3.6272, df = 10, p-value = 0.9626
# Sanity check on the untransformed response: without the log transform the
# BP p-value drops (0.17 vs 0.96) but still shows no significant
# heteroscedasticity at the 5% level.
lm4 = lm(salary~Age+FT.+MPG,train2)
bptest(lm4, ~ Age*FT.*MPG + I(Age^2) + I(FT.^2) + I(MPG^2), data = train2)
##
## studentized Breusch-Pagan test
##
## data: lm4
## BP = 13.964, df = 10, p-value = 0.1746
#no sign of multicollinearity
# All VIFs near 1 - negligible collinearity among Age, FT., MPG.
library(car)
vif(lm3)
## Age FT. MPG
## 1.041019 1.095500 1.057356
# Now remove MPG and consider only the skill variables that might affect a player's salary.
# Repeat subset selection without MPG (skills-only predictor set).
b = regsubsets(log(salary)~Age+FG.+FT.+PPG+APG+RPG+TOPG+BPG+SPG, data = train2)
rs = summary(b)
# NOTE(review): lm5 uses log(train2$salary) as the response expression rather
# than log(salary); with data = train2 the fit is the same, but log(salary)
# would be cleaner - confirm before changing, step(lm5) is commented out.
lm5 = lm(log(train2$salary)~Age+FG.+FT.+PPG+APG+RPG+TOPG+BPG+SPG,train2)
#step(lm5)
# Reduced skills-only PG model selected by AIC.
lm6 = lm(log(salary)~Age+FT.+PPG + RPG,train2)
summary(lm6)
##
## Call:
## lm(formula = log(salary) ~ Age + FT. + PPG + RPG, data = train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.29594 -0.38783 0.00852 0.33224 1.31336
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.50279 1.20550 7.053 1.58e-08 ***
## Age 0.09896 0.02674 3.700 0.000648 ***
## FT. 3.47504 1.37690 2.524 0.015682 *
## PPG 0.06962 0.02723 2.556 0.014483 *
## RPG 0.18613 0.12078 1.541 0.131180
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.651 on 40 degrees of freedom
## Multiple R-squared: 0.6237, Adjusted R-squared: 0.5861
## F-statistic: 16.57 on 4 and 40 DF, p-value: 4.367e-08
# Compare test-set RMSE for the full (lm2), Cp-selected (lm3), and
# skills-only (lm6) models.
# rmse: root-mean-square error between two equal-length numeric vectors.
rmse <- function(x, y) {
  sqrt(mean((x - y)^2))
}
# Held-out PG test-set RMSE (log scale); the skills-only model lm6 wins.
rmse(predict(lm2,test_PG),log(test_PG$salary))
## [1] 0.8903253
rmse(predict(lm3,test_PG),log(test_PG$salary))
## [1] 0.8233036
rmse(predict(lm6,test_PG),log(test_PG$salary))
## [1] 0.7772102
# Per-player prediction intervals on the log-salary scale.
# NOTE(review): "predict" is a partial match for interval = "prediction";
# it works via R's partial argument matching but the full word is clearer.
predict(lm6, test_PG, interval="predict")
## fit lwr upr
## 74 17.05761 15.59015 18.52508
## 95 14.95173 13.59520 16.30827
## 100 15.75783 14.35197 17.16368
## 286 15.55322 14.19188 16.91456
## 359 14.09199 12.72001 15.46397
## 360 15.44492 14.08005 16.80979
## 362 15.61179 14.26325 16.96033
## 385 15.03177 13.68666 16.37689
## 410 14.41061 13.04970 15.77152
## 417 15.27077 13.83427 16.70727
## 432 15.28954 13.94222 16.63685
# Second, the C (center) group ----
library(corrplot)
# Pairwise correlations among the numeric columns (drop X, Player, Pos).
corrplot(cor(train_C[,-c(1,2,3)]), method = 'number')
par(mfrow=c(2,3))
# Marginal distributions for the C group. FG. / FT. are labelled as
# percentages (csv-mangled "FG%" / "FT%" - confirm against source data);
# typos fixed ("Filed", "Assits").
hist(train_C$Age, freq = FALSE, main = "Histogram of Age", xlab = "Age")
hist(train_C$FG., freq = FALSE, main = "Histogram of Field Goal Percentage", xlab = "Field Goal Percentage")
hist(train_C$FT., freq = FALSE, main = "Histogram of Free Throw Percentage", xlab = "Free Throw Percentage")
hist(train_C$MPG, freq = FALSE, main = "Histogram of Minutes Played per Game", xlab = "Minutes Played per Game")
hist(train_C$PPG, freq = FALSE, main = "Histogram of Points per Game", xlab = "Points per Game")
hist(train_C$APG, freq = FALSE, main = "Histogram of Assists per Game", xlab = "Assists per Game")
hist(train_C$RPG, freq = FALSE, main = "Histogram of Rebounds per Game", xlab = "Rebounds per Game")
hist(train_C$TOPG, freq = FALSE, main = "Histogram of Turnovers per Game", xlab = "Turnovers per Game")
hist(train_C$BPG, freq = FALSE, main = "Histogram of Blocks per Game", xlab = "Blocks per Game")
hist(train_C$SPG, freq = FALSE, main = "Histogram of Steals per Game", xlab = "Steals per Game")
# Full C model: log(salary) on all ten predictors. log(salary) in the formula
# (not log(train_C$salary)) keeps the model object self-contained; the fit
# itself is unchanged.
lm1 <- lm(log(salary) ~ Age + FG. + FT. + MPG + PPG + APG + RPG + TOPG + BPG + SPG,
          data = train_C)
summary(lm1)
## (Echoed Call updated to match the formula; all estimates are identical.)
##
## Call:
## lm(formula = log(salary) ~ Age + FG. + FT. + MPG + PPG +
## APG + RPG + TOPG + BPG + SPG, data = train_C)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.53913 -0.47077 -0.04493 0.45163 1.88494
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.015055 2.354968 6.801 1.82e-08 ***
## Age 0.063203 0.029587 2.136 0.03802 *
## FG. -4.632594 2.452040 -1.889 0.06517 .
## FT. -1.547908 1.351832 -1.145 0.25811
## MPG 0.120110 0.042434 2.830 0.00687 **
## PPG -0.061823 0.063103 -0.980 0.33235
## APG -0.420992 0.168229 -2.502 0.01595 *
## RPG -0.012047 0.079125 -0.152 0.87966
## TOPG 0.520320 0.322899 1.611 0.11393
## BPG -0.345789 0.317512 -1.089 0.28180
## SPG -0.009676 0.509826 -0.019 0.98494
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7656 on 46 degrees of freedom
## Multiple R-squared: 0.5192, Adjusted R-squared: 0.4147
## F-statistic: 4.968 on 10 and 46 DF, p-value: 7.24e-05
# Residual diagnostics for the full C model.
plot(lm1)
# Outlier / influence diagnostics for the full C model.
library(car)
outlierTest(lm1)
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferroni p
## 309 2.902362 0.0057165 0.32584
# Studentized residuals vs hat-values, bubble size = Cook's distance.
influencePlot(lm1)
# Refit after dropping the influential C observations flagged above (rows
# identified by original row id X). %in% replaces the chain of != tests.
train2 <- subset(train_C, !(X %in% c(77, 97, 112, 309, 333)))
lm2 <- lm(log(salary) ~ Age + FG. + FT. + MPG + PPG + APG + RPG + TOPG + BPG + SPG,
          data = train2)
summary(lm2)
## (Echoed Call updated to match the formula; all estimates are identical.)
##
## Call:
## lm(formula = log(salary) ~ Age + FG. + FT. + MPG + PPG +
## APG + RPG + TOPG + BPG + SPG, data = train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.40905 -0.40228 -0.01136 0.43361 1.16496
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.18677 2.05173 7.889 9.43e-10 ***
## Age 0.07632 0.02913 2.620 0.01228 *
## FG. -5.90716 2.12730 -2.777 0.00824 **
## FT. -1.76820 1.18598 -1.491 0.14364
## MPG 0.12486 0.03766 3.316 0.00192 **
## PPG -0.06628 0.05660 -1.171 0.24828
## APG -0.34769 0.17556 -1.980 0.05439 .
## RPG -0.01914 0.06991 -0.274 0.78559
## TOPG 0.49642 0.33078 1.501 0.14109
## BPG -0.28894 0.28094 -1.028 0.30976
## SPG 0.43788 0.48539 0.902 0.37226
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6535 on 41 degrees of freedom
## Multiple R-squared: 0.6379, Adjusted R-squared: 0.5496
## F-statistic: 7.224 on 10 and 41 DF, p-value: 1.931e-06
# Residual diagnostics after removing the influential points.
plot(lm2)
#AIC and Mallow's Cp select model yields the same model
# Best-subset selection over all ten predictors for the C group.
library(leaps)
b = regsubsets(log(salary)~Age+FG.+FT.+MPG+PPG+APG+RPG+TOPG+BPG+SPG, data = train2)
rs = summary(b)
plot(2:9,rs$cp, ylab = "CP statistic", xlab = "Number of Parameters")
abline(0,1)
# Minimum-Cp model keeps Age, FG., FT., MPG, APG.
rs$which[which.min(rs$cp),]
## (Intercept) Age FG. FT. MPG PPG
## TRUE TRUE TRUE TRUE TRUE FALSE
## APG RPG TOPG BPG SPG
## TRUE FALSE FALSE FALSE FALSE
#step(lm2)
# Reduced C model from the Cp / AIC selection.
lm3 = lm(log(salary)~Age + FG. + FT.+ MPG + APG,train2)
summary(lm3)
##
## Call:
## lm(formula = log(salary) ~ Age + FG. + FT. + MPG + APG, data = train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.44783 -0.45007 -0.04617 0.41905 1.39482
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.42451 1.95793 8.389 7.96e-11 ***
## Age 0.07952 0.02570 3.094 0.00336 **
## FG. -5.72859 2.05632 -2.786 0.00773 **
## FT. -2.19521 1.04744 -2.096 0.04163 *
## MPG 0.10607 0.01719 6.172 1.60e-07 ***
## APG -0.25550 0.14768 -1.730 0.09032 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6454 on 46 degrees of freedom
## Multiple R-squared: 0.6038, Adjusted R-squared: 0.5607
## F-statistic: 14.02 on 5 and 46 DF, p-value: 2.48e-08
plot(lm3)
#no sign of heteroscedasticity
# Breusch-Pagan test with interactions and squared terms; large p-value.
library(lmtest)
bptest(lm3, ~ Age * FG. * FT.* MPG * APG + I(Age^2) + I(FG.^2) + I(FT.^2) + I(MPG^2) + I(APG^2), data = train2)
##
## studentized Breusch-Pagan test
##
## data: lm3
## BP = 20.855, df = 36, p-value = 0.9794
# Sanity check on the untransformed response: the BP p-value drops but still
# shows no significant heteroscedasticity at the 5% level.
lm4 = lm(salary~Age + FG. + FT.+ MPG + APG,train2)
bptest(lm4, ~ Age * FG. * FT.* MPG * APG + I(Age^2) + I(FG.^2) + I(FT.^2) + I(MPG^2) + I(APG^2), data = train2)
##
## studentized Breusch-Pagan test
##
## data: lm4
## BP = 32.774, df = 36, p-value = 0.6228
#no sign of multicollinearity
# All VIFs around 1-2 - acceptable collinearity.
library(car)
vif(lm3)
## Age FG. FT. MPG APG
## 1.113804 1.754544 1.648388 1.831585 2.041956
# Now remove MPG and consider only the skill variables that might affect a player's salary.
# Repeat subset selection without MPG (skills-only predictor set).
b = regsubsets(log(salary)~Age+FG.+FT.+PPG+APG+RPG+TOPG+BPG+SPG, data = train2)
rs = summary(b)
# NOTE(review): lm5 uses log(train2$salary) as the response expression; with
# data = train2 the fit is the same as log(salary), just less tidy.
lm5 = lm(log(train2$salary)~Age+FG.+FT.+PPG+APG+RPG+TOPG+BPG+SPG,train2)
#step(lm5)
# Reduced skills-only C model selected by AIC.
lm6 = lm(log(salary)~ Age+FG.+FT.+RPG,train2)
summary(lm6)
##
## Call:
## lm(formula = log(salary) ~ Age + FG. + FT. + RPG, data = train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.6734 -0.4292 -0.0923 0.5551 1.1445
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.45252 2.09953 8.789 1.74e-11 ***
## Age 0.06505 0.02806 2.318 0.02485 *
## FG. -6.86474 2.21839 -3.094 0.00332 **
## FT. -2.72837 1.13792 -2.398 0.02052 *
## RPG 0.18506 0.03400 5.443 1.85e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7134 on 47 degrees of freedom
## Multiple R-squared: 0.5053, Adjusted R-squared: 0.4632
## F-statistic: 12 on 4 and 47 DF, p-value: 8.44e-07
#compare rmse for original model and reduced model
# Held-out C test-set RMSE (log scale); the full model lm2 wins here.
rmse=function(x,y){sqrt(mean((x-y)^2))}
rmse(predict(lm2,test_C),log(test_C$salary))
## [1] 0.9035814
rmse(predict(lm3,test_C),log(test_C$salary))
## [1] 0.9377931
rmse(predict(lm6,test_C),log(test_C$salary))
## [1] 1.03725
# Per-player prediction intervals on the log-salary scale ("predict" is a
# partial match for interval = "prediction").
predict(lm6, test_C, interval="predict")
## fit lwr upr
## 5 15.39580 13.89035 16.90125
## 15 17.69028 16.05980 19.32075
## 62 15.95068 14.42018 17.48118
## 63 15.37115 13.90986 16.83244
## 72 14.71429 13.15701 16.27157
## 150 15.71457 14.23488 17.19426
## 215 15.43631 13.94066 16.93196
## 245 16.42167 14.89485 17.94848
## 271 16.31635 14.81985 17.81286
## 290 16.94133 15.40278 18.47989
## 301 16.18228 14.57069 17.79386
## 315 15.29552 13.76339 16.82765
## 375 15.93430 14.42219 17.44641
## 411 15.12408 13.64060 16.60756
## 450 16.17169 14.65599 17.68739
# Third, the SG (shooting guard) group ----
library(corrplot)
# Pairwise correlations among the numeric columns (drop X, Player, Pos).
corrplot(cor(train_SG[,-c(1,2,3)]), method = 'number')
par(mfrow=c(2,3))
# Marginal distributions for the SG group. FG. / FT. are labelled as
# percentages (csv-mangled "FG%" / "FT%" - confirm against source data);
# typos fixed ("Filed", "Assits").
hist(train_SG$Age, freq = FALSE, main = "Histogram of Age", xlab = "Age")
hist(train_SG$FG., freq = FALSE, main = "Histogram of Field Goal Percentage", xlab = "Field Goal Percentage")
hist(train_SG$FT., freq = FALSE, main = "Histogram of Free Throw Percentage", xlab = "Free Throw Percentage")
hist(train_SG$MPG, freq = FALSE, main = "Histogram of Minutes Played per Game", xlab = "Minutes Played per Game")
hist(train_SG$PPG, freq = FALSE, main = "Histogram of Points per Game", xlab = "Points per Game")
hist(train_SG$APG, freq = FALSE, main = "Histogram of Assists per Game", xlab = "Assists per Game")
hist(train_SG$RPG, freq = FALSE, main = "Histogram of Rebounds per Game", xlab = "Rebounds per Game")
hist(train_SG$TOPG, freq = FALSE, main = "Histogram of Turnovers per Game", xlab = "Turnovers per Game")
hist(train_SG$BPG, freq = FALSE, main = "Histogram of Blocks per Game", xlab = "Blocks per Game")
hist(train_SG$SPG, freq = FALSE, main = "Histogram of Steals per Game", xlab = "Steals per Game")
# Full SG model: log(salary) on all ten predictors. log(salary) in the
# formula (not log(train_SG$salary)) keeps the model object self-contained;
# the fit itself is unchanged.
lm1 <- lm(log(salary) ~ Age + FG. + FT. + MPG + PPG + APG + RPG + TOPG + BPG + SPG,
          data = train_SG)
summary(lm1)
## (Echoed Call updated to match the formula; all estimates are identical.)
##
## Call:
## lm(formula = log(salary) ~ Age + FG. + FT. + MPG + PPG +
## APG + RPG + TOPG + BPG + SPG, data = train_SG)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.83255 -0.44192 0.01654 0.52603 1.52688
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.47292 1.96487 5.839 5.82e-07 ***
## Age 0.04549 0.03037 1.498 0.14130
## FG. 0.54106 3.28136 0.165 0.86979
## FT. 0.47179 1.12032 0.421 0.67572
## MPG 0.07770 0.02846 2.730 0.00907 **
## PPG 0.02358 0.04475 0.527 0.60098
## APG 0.07345 0.14661 0.501 0.61886
## RPG -0.06213 0.13715 -0.453 0.65277
## TOPG -0.07060 0.36555 -0.193 0.84775
## BPG 0.46570 0.60771 0.766 0.44758
## SPG 0.06410 0.41413 0.155 0.87770
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7221 on 44 degrees of freedom
## Multiple R-squared: 0.646, Adjusted R-squared: 0.5655
## F-statistic: 8.028 on 10 and 44 DF, p-value: 3.443e-07
# Residual diagnostics for the full SG model.
plot(lm1)
# Outlier / influence diagnostics for the full SG model.
library(car)
outlierTest(lm1)
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferroni p
## 145 -3.076338 0.0036374 0.20006
# Studentized residuals vs hat-values, bubble size = Cook's distance.
influencePlot(lm1)
# Refit after dropping the influential SG observations flagged above (rows
# identified by original row id X). %in% replaces the chain of != tests.
train2 <- subset(train_SG, !(X %in% c(8, 17, 79, 145)))
lm2 <- lm(log(salary) ~ Age + FG. + FT. + MPG + PPG + APG + RPG + TOPG + BPG + SPG,
          data = train2)
summary(lm2)
## (Echoed Call updated to match the formula; all estimates are identical.)
##
## Call:
## lm(formula = log(salary) ~ Age + FG. + FT. + MPG + PPG +
## APG + RPG + TOPG + BPG + SPG, data = train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.14855 -0.31580 -0.02903 0.41578 1.09627
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.94774 1.75651 6.802 3.54e-08 ***
## Age 0.06125 0.02761 2.218 0.03226 *
## FG. -1.65592 3.11812 -0.531 0.59831
## FT. 0.46095 1.08641 0.424 0.67363
## MPG 0.07364 0.02648 2.781 0.00821 **
## PPG 0.05902 0.04112 1.435 0.15898
## APG 0.06021 0.13363 0.451 0.65475
## RPG -0.04417 0.12773 -0.346 0.73132
## TOPG -0.22281 0.33945 -0.656 0.51534
## BPG 0.52355 0.78376 0.668 0.50797
## SPG -0.03669 0.37773 -0.097 0.92310
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6376 on 40 degrees of freedom
## Multiple R-squared: 0.7317, Adjusted R-squared: 0.6647
## F-statistic: 10.91 on 10 and 40 DF, p-value: 1.219e-08
# Residual diagnostics after removing the influential points.
plot(lm2)
#AIC and Mallow's Cp select model yields the same model
# Best-subset selection over all ten predictors for the SG group.
library(leaps)
b = regsubsets(log(salary)~Age+FG.+FT.+MPG+PPG+APG+RPG+TOPG+BPG+SPG, data = train2)
rs = summary(b)
plot(2:9,rs$cp, ylab = "CP statistic", xlab = "Number of Parameters")
abline(0,1)
# Minimum-Cp model keeps Age, MPG, PPG.
rs$which[which.min(rs$cp),]
## (Intercept) Age FG. FT. MPG PPG
## TRUE TRUE FALSE FALSE TRUE TRUE
## APG RPG TOPG BPG SPG
## FALSE FALSE FALSE FALSE FALSE
#step(lm2)
# Reduced SG model from the Cp / AIC selection.
lm3 = lm(log(salary)~Age + MPG + PPG,train2)
summary(lm3)
##
## Call:
## lm(formula = log(salary) ~ Age + MPG + PPG, data = train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.18382 -0.38717 -0.05981 0.42089 1.20669
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.38183 0.67270 16.920 < 2e-16 ***
## Age 0.06997 0.02354 2.973 0.00464 **
## MPG 0.07055 0.02137 3.301 0.00184 **
## PPG 0.04798 0.02631 1.823 0.07460 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5976 on 47 degrees of freedom
## Multiple R-squared: 0.7231, Adjusted R-squared: 0.7054
## F-statistic: 40.91 on 3 and 47 DF, p-value: 3.743e-13
plot(lm3)
#no sign of heteroscedasticity
# Breusch-Pagan test with interactions and squared terms; large p-value.
library(lmtest)
bptest(lm3, ~ Age * MPG * PPG + I(Age^2) + I(MPG^2) + I(PPG^2), data = train2)
##
## studentized Breusch-Pagan test
##
## data: lm3
## BP = 8.1532, df = 10, p-value = 0.6139
# Sanity check on the untransformed response: the BP p-value drops but still
# shows no significant heteroscedasticity at the 5% level.
lm4 = lm(salary~Age + MPG + PPG,train2)
bptest(lm4, ~ Age * MPG * PPG + I(Age^2) + I(MPG^2) + I(PPG^2), data = train2)
##
## studentized Breusch-Pagan test
##
## data: lm4
## BP = 15.514, df = 10, p-value = 0.1144
#not a sign of multicollinearity
# NOTE(review): MPG and PPG have VIFs around 4.2 - moderate (below the
# common cutoff of 5-10) but not negligible.
library(car)
vif(lm3)
## Age MPG PPG
## 1.020484 4.258776 4.235640
# Now remove MPG and consider only the skill variables that might affect a player's salary.
# Repeat subset selection without MPG (skills-only predictor set).
b = regsubsets(log(salary)~Age+FG.+FT.+PPG+APG+RPG+TOPG+BPG+SPG, data = train2)
rs = summary(b)
# NOTE(review): lm5 uses log(train2$salary) as the response expression; with
# data = train2 the fit is the same as log(salary), just less tidy.
lm5 = lm(log(train2$salary)~Age+FG.+FT.+PPG+APG+RPG+TOPG+BPG+SPG,train2)
#step(lm5)
# Reduced skills-only SG model selected by AIC.
lm6 = lm(log(salary)~ Age + PPG,train2)
summary(lm6)
##
## Call:
## lm(formula = log(salary) ~ Age + PPG, data = train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.47386 -0.44682 -0.03799 0.35105 1.20251
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.09741 0.69938 17.297 < 2e-16 ***
## Age 0.07571 0.02578 2.937 0.00508 **
## PPG 0.12372 0.01415 8.746 1.68e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6563 on 48 degrees of freedom
## Multiple R-squared: 0.6589, Adjusted R-squared: 0.6447
## F-statistic: 46.36 on 2 and 48 DF, p-value: 6.164e-12
# Compare RMSE for the full (lm2), Cp-selected (lm3), and skills-only (lm6)
# SG models.
# BUG FIX: the original evaluated on train_SG, but the PG and C sections
# evaluate on the held-out test set - use test_SG here as well. The echoed
# values below (and the echoed prediction table that follows this chunk)
# are from the old train-set run and must be regenerated.
rmse <- function(x, y) {
  sqrt(mean((x - y)^2))
}
rmse(predict(lm2, test_SG), log(test_SG$salary))
## [1] 0.6588942
rmse(predict(lm3, test_SG), log(test_SG$salary))
## [1] 0.6575775
rmse(predict(lm6, test_SG), log(test_SG$salary))
## [1] 0.7322491
# Per-player prediction intervals on the log-salary scale.
predict(lm6, test_SG, interval = "prediction")
## fit lwr upr
## 8 15.02092 13.68493 16.35691
## 9 14.49684 13.14663 15.84704
## 13 15.62335 14.28466 16.96204
## 17 14.68138 13.33830 16.02446
## 26 14.93808 13.55885 16.31730
## 28 15.90603 14.56997 17.24208
## 30 14.84685 13.50354 16.19015
## 37 15.44475 14.10786 16.78163
## 39 16.71635 15.32909 18.10360
## 43 14.35012 12.99581 15.70443
## 48 14.77284 13.42847 16.11720
## 49 15.65892 14.31971 16.99812
## 67 16.00651 14.64900 17.36401
## 76 14.35056 12.99241 15.70872
## 79 15.42927 14.08501 16.77352
## 86 14.97116 13.63298 16.30935
## 92 14.50853 13.14460 15.87247
## 96 17.06295 15.68685 18.43906
## 101 15.17262 13.83142 16.51381
## 111 15.83516 14.49763 17.17268
## 119 14.17161 12.80257 15.54065
## 123 16.23571 14.82380 17.64762
## 124 15.75936 14.42518 17.09354
## 131 16.52186 15.17159 17.87212
## 145 16.01997 14.66245 17.37748
## 168 16.71333 15.33787 18.08880
## 169 15.54402 14.18250 16.90554
## 180 17.98227 16.53942 19.42511
## 191 15.52863 14.00739 17.04987
## 205 15.26699 13.92683 16.60716
## 206 16.95810 15.58775 18.32846
## 208 15.15130 13.80156 16.50103
## 224 15.94229 14.60620 17.27839
## 226 15.70705 14.36708 17.04703
## 241 15.72264 14.38869 17.05660
## 252 15.80861 14.47405 17.14318
## 253 15.57305 14.22873 16.91737
## 259 16.61777 15.25998 17.97557
## 272 15.28382 13.95051 16.61712
## 274 14.83387 13.49453 16.17321
## 278 17.23528 15.85194 18.61862
## 293 14.01261 12.64281 15.38241
## 295 15.09893 13.75091 16.44695
## 328 15.41875 14.05472 16.78278
## 329 15.72361 14.38695 17.06027
## 337 14.59499 13.24720 15.94278
## 344 14.65561 13.31166 15.99956
## 368 15.80893 14.46703 17.15084
## 390 14.31674 12.95762 15.67587
## 396 14.26027 12.89380 15.62674
## 414 14.95403 13.61700 16.29106
## 418 14.91715 13.57947 16.25483
## 420 14.45065 13.09917 15.80212
## 427 15.16691 13.83257 16.50124
## 441 16.85231 15.46780 18.23682
# Fourth, the PF (power forward) group ----
library(corrplot)
# Pairwise correlations among the numeric columns (drop X, Player, Pos).
corrplot(cor(train_PF[,-c(1,2,3)]), method = 'number')
par(mfrow=c(2,3))
# Marginal distributions for the PF group. FG. / FT. are labelled as
# percentages (csv-mangled "FG%" / "FT%" - confirm against source data);
# typos fixed ("Filed", "Assits").
hist(train_PF$Age, freq = FALSE, main = "Histogram of Age", xlab = "Age")
hist(train_PF$FG., freq = FALSE, main = "Histogram of Field Goal Percentage", xlab = "Field Goal Percentage")
hist(train_PF$FT., freq = FALSE, main = "Histogram of Free Throw Percentage", xlab = "Free Throw Percentage")
hist(train_PF$MPG, freq = FALSE, main = "Histogram of Minutes Played per Game", xlab = "Minutes Played per Game")
hist(train_PF$PPG, freq = FALSE, main = "Histogram of Points per Game", xlab = "Points per Game")
hist(train_PF$APG, freq = FALSE, main = "Histogram of Assists per Game", xlab = "Assists per Game")
hist(train_PF$RPG, freq = FALSE, main = "Histogram of Rebounds per Game", xlab = "Rebounds per Game")
hist(train_PF$TOPG, freq = FALSE, main = "Histogram of Turnovers per Game", xlab = "Turnovers per Game")
hist(train_PF$BPG, freq = FALSE, main = "Histogram of Blocks per Game", xlab = "Blocks per Game")
hist(train_PF$SPG, freq = FALSE, main = "Histogram of Steals per Game", xlab = "Steals per Game")
# Full PF model: log(salary) on all ten predictors. log(salary) in the
# formula (not log(train_PF$salary)) keeps the model object self-contained;
# the fit itself is unchanged.
lm1 <- lm(log(salary) ~ Age + FG. + FT. + MPG + PPG + APG + RPG + TOPG + BPG + SPG,
          data = train_PF)
summary(lm1)
## (Echoed Call updated to match the formula; all estimates are identical.)
##
## Call:
## lm(formula = log(salary) ~ Age + FG. + FT. + MPG + PPG +
## APG + RPG + TOPG + BPG + SPG, data = train_PF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.26801 -0.43526 -0.09354 0.47326 1.17178
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.417567 1.371366 7.596 3.30e-09 ***
## Age 0.138161 0.029515 4.681 3.41e-05 ***
## FG. -1.294810 2.163701 -0.598 0.5530
## FT. 0.636577 1.065327 0.598 0.5536
## MPG -0.001643 0.038856 -0.042 0.9665
## PPG 0.076128 0.046796 1.627 0.1118
## APG 0.289919 0.170015 1.705 0.0961 .
## RPG 0.234687 0.110705 2.120 0.0404 *
## TOPG -0.813774 0.529994 -1.535 0.1327
## BPG 0.038748 0.291752 0.133 0.8950
## SPG -0.018530 0.497542 -0.037 0.9705
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6726 on 39 degrees of freedom
## Multiple R-squared: 0.681, Adjusted R-squared: 0.5992
## F-statistic: 8.325 on 10 and 39 DF, p-value: 4.839e-07
# Residual diagnostics for the full PF model.
plot(lm1)
# Outlier / influence diagnostics for the full PF model.
library(car)
outlierTest(lm1)
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferroni p
## 308 -2.064256 0.045869 NA
# Studentized residuals vs hat-values, bubble size = Cook's distance.
influencePlot(lm1)
# Refit after dropping the influential PF observations flagged above (rows
# identified by original row id X). %in% replaces the chain of != tests.
train2 <- subset(train_PF, !(X %in% c(118, 251, 262, 276, 308, 458)))
lm2 <- lm(log(salary) ~ Age + FG. + FT. + MPG + PPG + APG + RPG + TOPG + BPG + SPG,
          data = train2)
summary(lm2)
## (Echoed Call updated to match the formula; all estimates are identical.)
##
## Call:
## lm(formula = log(salary) ~ Age + FG. + FT. + MPG + PPG +
## APG + RPG + TOPG + BPG + SPG, data = train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0158 -0.3305 -0.1056 0.4281 0.9896
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.98857 1.32042 7.565 1.06e-08 ***
## Age 0.16526 0.03563 4.638 5.33e-05 ***
## FG. -1.43707 2.12482 -0.676 0.5035
## FT. 0.49670 1.07943 0.460 0.6484
## MPG -0.01472 0.04534 -0.325 0.7475
## PPG 0.10276 0.05899 1.742 0.0908 .
## APG 0.33071 0.24684 1.340 0.1895
## RPG 0.16639 0.11161 1.491 0.1455
## TOPG -0.66710 0.57397 -1.162 0.2535
## BPG 0.20804 0.45248 0.460 0.6487
## SPG -0.05157 0.46642 -0.111 0.9126
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.624 on 33 degrees of freedom
## Multiple R-squared: 0.719, Adjusted R-squared: 0.6338
## F-statistic: 8.443 on 10 and 33 DF, p-value: 1.265e-06
# Residual diagnostics after removing the influential points.
plot(lm2)
#AIC: Age + PPG
#and Mallow's Cp: Age + PPG + APG + RPG + TOPG EXISTS MULTICOOLIENEARITY
library(leaps)
b = regsubsets(log(salary)~Age+FG.+FT.+MPG+PPG+APG+RPG+TOPG+BPG+SPG, data = train2)
rs = summary(b)
plot(2:9,rs$cp, ylab = "CP statistic", xlab = "Number of Parameters")
abline(0,1)
rs$which[which.min(rs$cp),]
## (Intercept) Age FG. FT. MPG PPG
## TRUE TRUE FALSE FALSE FALSE TRUE
## APG RPG TOPG BPG SPG
## FALSE FALSE FALSE FALSE FALSE
#step(lm2)
lm3 = lm(log(salary)~Age + PPG,train2)
summary(lm3)
##
## Call:
## lm(formula = log(salary) ~ Age + PPG, data = train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.31483 -0.35817 0.02036 0.42330 1.02781
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.70125 0.74194 13.075 3.24e-16 ***
## Age 0.16959 0.02618 6.477 9.07e-08 ***
## PPG 0.12225 0.01630 7.500 3.28e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5921 on 41 degrees of freedom
## Multiple R-squared: 0.6856, Adjusted R-squared: 0.6703
## F-statistic: 44.71 on 2 and 41 DF, p-value: 4.983e-11
plot(lm3)
# Residual diagnostics show no sign of heteroscedasticity
library(lmtest)
bptest(lm3, ~ Age * PPG + I(Age^2) + I(PPG^2), data = train2)
##
## studentized Breusch-Pagan test
##
## data: lm3
## BP = 4.1702, df = 5, p-value = 0.5252
# For comparison, refit on the untransformed salary scale and re-check
lm4 <- lm(salary ~ Age + PPG, data = train2)
bptest(lm4, ~ Age * PPG + I(Age^2) + I(PPG^2), data = train2)
##
## studentized Breusch-Pagan test
##
## data: lm4
## BP = 5.6517, df = 5, p-value = 0.3416
# Variance inflation factors show no sign of multicollinearity
library(car)
vif(lm3)
## Age PPG
## 1.009991 1.009991
Now, remove MPG and consider only the player's skill variables that might affect his salary.
# Best-subset selection without MPG, keeping only skill predictors
b <- regsubsets(log(salary) ~ Age + FG. + FT. + PPG + APG + RPG + TOPG + BPG + SPG,
                data = train2)
rs <- summary(b)
# Use log(salary) with data = train2 (not log(train2$salary)): embedding the data
# frame in the formula breaks predict() with newdata and ties the model to one object
lm5 <- lm(log(salary) ~ Age + FG. + FT. + PPG + APG + RPG + TOPG + BPG + SPG,
          data = train2)
# step(lm5)
lm6 <- lm(log(salary) ~ Age + PPG + APG + RPG + TOPG, data = train2)
summary(lm6)
##
## Call:
## lm(formula = log(salary) ~ Age + PPG + APG + RPG + TOPG, data = train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0558 -0.3414 -0.0812 0.3528 0.9883
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.72676 0.74297 13.092 1.17e-15 ***
## Age 0.16139 0.02650 6.090 4.28e-07 ***
## PPG 0.10014 0.03620 2.766 0.00871 **
## APG 0.31670 0.20653 1.533 0.13344
## RPG 0.13813 0.09058 1.525 0.13555
## TOPG -0.68436 0.51223 -1.336 0.18948
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5893 on 38 degrees of freedom
## Multiple R-squared: 0.7114, Adjusted R-squared: 0.6734
## F-statistic: 18.73 on 5 and 38 DF, p-value: 2.36e-09
# Compare RMSE of the original (full) model and the reduced models
rmse <- function(x, y) {
  sqrt(mean((x - y)^2))
}
rmse(predict(lm2,train_PF),log(train_PF$salary)) # full model (lm2) RMSE on the PF training set
## [1] 0.633717
rmse(predict(lm3,train_PF),log(train_PF$salary)) # Age + PPG model RMSE on the PF training set
## [1] 0.6672642
rmse(predict(lm6,train_PF),log(train_PF$salary)) # Age + PPG + APG + RPG + TOPG model RMSE
## [1] 0.6296484
# Per-player prediction intervals on the log-salary scale.
# Spell out interval = "prediction" instead of relying on partial matching of "predict"
predict(lm6, train_PF, interval = "prediction")
## fit lwr upr
## 2 15.61036 14.34463 16.87609
## 6 15.67198 14.37449 16.96947
## 23 16.06217 14.79191 17.33243
## 33 17.28629 15.85959 18.71299
## 54 17.02343 15.70466 18.34221
## 82 15.34615 14.09219 16.60011
## 88 14.61347 13.36189 15.86505
## 98 16.65687 15.42056 17.89318
## 118 16.56938 14.64690 18.49185
## 132 14.64953 13.36524 15.93381
## 133 16.16674 14.93998 17.39349
## 136 15.15574 13.89765 16.41383
## 147 17.00424 15.65314 18.35533
## 152 13.63947 12.36798 14.91097
## 153 16.06586 14.78669 17.34504
## 170 14.82259 13.57454 16.07064
## 181 16.21979 14.87783 17.56176
## 184 15.79489 14.46497 17.12481
## 185 15.59436 14.29481 16.89391
## 187 14.32602 13.04868 15.60336
## 200 14.59255 13.35651 15.82858
## 221 15.53831 14.29732 16.77929
## 232 14.30994 13.07202 15.54786
## 249 15.39417 14.10383 16.68451
## 251 15.18076 13.94938 16.41214
## 262 15.50669 13.98011 17.03328
## 266 15.09471 13.78960 16.39982
## 273 14.96296 13.72332 16.20259
## 276 18.98504 17.10004 20.87003
## 279 15.42891 14.09046 16.76736
## 294 15.94465 14.71034 17.17896
## 303 16.15104 14.91907 17.38301
## 308 14.84792 13.61770 16.07815
## 325 15.52595 14.30644 16.74546
## 334 16.19328 14.91247 17.47410
## 342 15.95398 14.68440 17.22357
## 343 14.84329 13.54818 16.13840
## 347 14.90394 13.67226 16.13562
## 353 14.85074 13.61870 16.08278
## 377 15.88844 14.64575 17.13113
## 379 14.03916 12.79349 15.28484
## 382 13.86332 12.61043 15.11621
## 383 15.82604 14.56953 17.08256
## 401 16.73463 15.46009 18.00918
## 407 16.16808 14.94582 17.39034
## 413 16.23805 14.91792 17.55818
## 422 15.63290 14.40313 16.86268
## 424 14.75007 13.50228 15.99787
## 429 14.33206 13.07270 15.59143
## 458 17.25709 15.85288 18.66130
Finally, for the SF group, we need to proceed with caution because there are few data points.
library(corrplot)
# Correlation matrix of the numeric columns (first three columns are X, Player, Pos)
corrplot(cor(train_SF[, -c(1, 2, 3)]), method = 'number')
par(mfrow = c(2, 3))
# NOTE(review): FG. and FT. appear to be percentage columns — confirm whether the
# "per Game" axis labels should read "Percentage" instead
hist(train_SF$Age, freq = FALSE, main = "Histogram of Age", xlab = "Age")
hist(train_SF$FG., freq = FALSE, main = "Histogram of Field Goals per Game", xlab = "Field Goals per Game")
hist(train_SF$FT., freq = FALSE, main = "Histogram of Free Throws per Game", xlab = "Free Throws per Game")
hist(train_SF$MPG, freq = FALSE, main = "Histogram of Minutes Played per Game", xlab = "Minutes Played per Game")
hist(train_SF$PPG, freq = FALSE, main = "Histogram of Points per Game", xlab = "Points per Game")
hist(train_SF$APG, freq = FALSE, main = "Histogram of Assists per Game", xlab = "Assists per Game")
hist(train_SF$RPG, freq = FALSE, main = "Histogram of Rebounds per Game", xlab = "Rebounds per Game")
hist(train_SF$TOPG, freq = FALSE, main = "Histogram of Turnovers per Game", xlab = "Turnovers per Game")
hist(train_SF$BPG, freq = FALSE, main = "Histogram of Blocks per Game", xlab = "Blocks per Game")
hist(train_SF$SPG, freq = FALSE, main = "Histogram of Steals per Game", xlab = "Steals per Game")
# Use log(salary) with data = train_SF (not log(train_SF$salary)) so the formula is
# self-contained and predict() with newdata behaves correctly
lm1 <- lm(log(salary) ~ Age + FG. + FT. + MPG + PPG + APG + RPG + TOPG + BPG + SPG,
          data = train_SF)
summary(lm1) # full SF model fit summary
##
## Call:
## lm(formula = log(train_SF$salary) ~ Age + FG. + FT. + MPG + PPG +
## APG + RPG + TOPG + BPG + SPG, data = train_SF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.00452 -0.42213 0.05411 0.44955 1.65939
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.81493 2.29050 3.848 0.00073 ***
## Age 0.12539 0.04495 2.790 0.00994 **
## FG. 4.33691 3.64171 1.191 0.24488
## FT. 0.41396 2.24372 0.184 0.85511
## MPG 0.02813 0.04995 0.563 0.57834
## PPG 0.18076 0.09295 1.945 0.06315 .
## APG -0.37214 0.43300 -0.859 0.39826
## RPG -0.28244 0.22314 -1.266 0.21726
## TOPG -0.23898 1.04375 -0.229 0.82076
## BPG 0.52375 0.67757 0.773 0.44678
## SPG 0.21274 0.57001 0.373 0.71213
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9418 on 25 degrees of freedom
## Multiple R-squared: 0.5985, Adjusted R-squared: 0.4379
## F-statistic: 3.727 on 10 and 25 DF, p-value: 0.003658
plot(lm1) # residual diagnostic plots for the full SF model
library(car)
outlierTest(lm1) # Bonferroni outlier test on studentized residuals
## No Studentized residuals with Bonferroni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferroni p
## 264 -3.30469 0.0029776 0.10719
influencePlot(lm1)
# Drop the four influential SF observations flagged above, identified by row id X
train2 <- subset(train_SF, !(X %in% c(235, 254, 264, 350)))
# Use log(salary) with data = train2 (not log(train2$salary)) so predict() with
# newdata behaves correctly
lm2 <- lm(log(salary) ~ Age + FG. + FT. + MPG + PPG + APG + RPG + TOPG + BPG + SPG,
          data = train2)
summary(lm2) # refit of the full SF model after removing influential observations
##
## Call:
## lm(formula = log(train2$salary) ~ Age + FG. + FT. + MPG + PPG +
## APG + RPG + TOPG + BPG + SPG, data = train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.23421 -0.42048 -0.03894 0.46638 1.03455
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.77632 2.11238 5.575 1.56e-05 ***
## Age 0.10796 0.03827 2.821 0.0102 *
## FG. 1.65227 3.09157 0.534 0.5986
## FT. -1.80401 2.07271 -0.870 0.3939
## MPG 0.04705 0.04770 0.986 0.3352
## PPG 0.14591 0.08838 1.651 0.1136
## APG 0.40514 0.42611 0.951 0.3525
## RPG -0.34705 0.20498 -1.693 0.1052
## TOPG -0.89746 0.95123 -0.943 0.3562
## BPG 1.53581 0.91001 1.688 0.1063
## SPG -0.19293 0.60743 -0.318 0.7539
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.746 on 21 degrees of freedom
## Multiple R-squared: 0.654, Adjusted R-squared: 0.4892
## F-statistic: 3.969 on 10 and 21 DF, p-value: 0.003756
plot(lm2)
# AIC and Mallow's Cp select the same model for this group
library(leaps)
b <- regsubsets(log(salary) ~ Age + FG. + FT. + MPG + PPG + APG + RPG + TOPG + BPG + SPG,
                data = train2)
rs <- summary(b)
plot(2:9, rs$cp, ylab = "CP statistic", xlab = "Number of Parameters")
abline(0, 1)
# Predictor subset at the minimum-Cp model
rs$which[which.min(rs$cp), ]
## (Intercept) Age FG. FT. MPG PPG
## TRUE TRUE FALSE FALSE FALSE TRUE
## APG RPG TOPG BPG SPG
## FALSE TRUE FALSE TRUE FALSE
# step(lm2)  # stepwise AIC arrives at the same reduced model
lm3 <- lm(log(salary) ~ Age + PPG + RPG + BPG, data = train2)
summary(lm3)
##
## Call:
## lm(formula = log(salary) ~ Age + PPG + RPG + BPG, data = train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.33417 -0.40081 -0.07458 0.37252 1.41848
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.04769 0.85467 12.926 4.44e-13 ***
## Age 0.12118 0.02936 4.127 0.000316 ***
## PPG 0.16632 0.03905 4.259 0.000222 ***
## RPG -0.28674 0.15802 -1.815 0.080713 .
## BPG 1.49306 0.71904 2.076 0.047496 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7114 on 27 degrees of freedom
## Multiple R-squared: 0.5954, Adjusted R-squared: 0.5355
## F-statistic: 9.933 on 4 and 27 DF, p-value: 4.475e-05
plot(lm3)
# Residual diagnostics show no sign of heteroscedasticity
library(lmtest)
bptest(lm3, ~ Age * PPG * RPG * BPG + I(Age^2) + I(PPG^2) + I(RPG^2) + I(BPG^2),
       data = train2)
##
## studentized Breusch-Pagan test
##
## data: lm3
## BP = 13.562, df = 19, p-value = 0.8086
# For comparison, refit on the untransformed salary scale and re-check
lm4 <- lm(salary ~ Age + PPG + RPG + BPG, data = train2)
bptest(lm4, ~ Age * PPG * RPG * BPG + I(Age^2) + I(PPG^2) + I(RPG^2) + I(BPG^2),
       data = train2)
##
## studentized Breusch-Pagan test
##
## data: lm4
## BP = 11.547, df = 19, p-value = 0.9041
# Variance inflation factors show no sign of multicollinearity
library(car)
vif(lm3)
## Age PPG RPG BPG
## 1.003207 1.875995 2.842068 1.742353
Now, remove MPG and consider only the player's skill variables that might affect his salary.
# Best-subset selection without MPG, keeping only skill predictors
b <- regsubsets(log(salary) ~ Age + FG. + FT. + PPG + APG + RPG + TOPG + BPG + SPG,
                data = train2)
rs <- summary(b)
# Use log(salary) with data = train2 (not log(train2$salary)): embedding the data
# frame in the formula breaks predict() with newdata and ties the model to one object
lm5 <- lm(log(salary) ~ Age + FG. + FT. + PPG + APG + RPG + TOPG + BPG + SPG,
          data = train2)
# step(lm5)
lm6 <- lm(log(salary) ~ Age + PPG + RPG + BPG, data = train2)
summary(lm6)
##
## Call:
## lm(formula = log(salary) ~ Age + PPG + RPG + BPG, data = train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.33417 -0.40081 -0.07458 0.37252 1.41848
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.04769 0.85467 12.926 4.44e-13 ***
## Age 0.12118 0.02936 4.127 0.000316 ***
## PPG 0.16632 0.03905 4.259 0.000222 ***
## RPG -0.28674 0.15802 -1.815 0.080713 .
## BPG 1.49306 0.71904 2.076 0.047496 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7114 on 27 degrees of freedom
## Multiple R-squared: 0.5954, Adjusted R-squared: 0.5355
## F-statistic: 9.933 on 4 and 27 DF, p-value: 4.475e-05
# Compare RMSE of the original (full) model and the reduced SF model
rmse <- function(x, y) {
  sqrt(mean((x - y)^2))
}
rmse(predict(lm2, train_SF), log(train_SF$salary))
## [1] 0.9533102
#rmse(predict(lm3,train_SF),log(train_SF$salary))
rmse(predict(lm6,train_SF),log(train_SF$salary)) # reduced SF model RMSE on the training set
## [1] 0.9587561
# Per-player prediction intervals on the log-salary scale.
# Spell out interval = "prediction" instead of relying on partial matching of "predict"
predict(lm6, train_SF, interval = "prediction")
## fit lwr upr
## 3 14.34873 12.81058 15.88689
## 20 16.33546 14.69209 17.97882
## 38 15.99437 14.43598 17.55277
## 50 16.16912 14.62463 17.71361
## 55 13.87185 12.30206 15.44164
## 56 15.61327 14.11485 17.11169
## 83 15.34755 13.84107 16.85403
## 110 14.94586 13.42144 16.47028
## 116 15.07882 13.58516 16.57248
## 134 16.54660 14.90336 18.18984
## 135 15.59063 14.10057 17.08070
## 172 15.33281 13.84230 16.82331
## 174 14.03587 12.43613 15.63560
## 211 15.82829 14.20332 17.45327
## 235 16.47526 14.71924 18.23128
## 240 14.56215 13.05394 16.07037
## 254 19.60119 17.07727 22.12511
## 258 16.43131 14.78497 18.07766
## 263 14.94690 13.32929 16.56452
## 264 14.18436 12.61192 15.75680
## 282 14.96067 13.44800 16.47333
## 297 14.85752 13.34479 16.37024
## 307 15.34209 13.72725 16.95692
## 312 14.96198 13.45674 16.46721
## 341 14.95646 13.42040 16.49251
## 350 17.08479 15.41507 18.75450
## 352 14.22532 12.69711 15.75353
## 366 16.20897 14.60052 17.81741
## 372 15.26702 13.60959 16.92445
## 373 14.14819 12.58938 15.70701
## 412 14.41647 12.86404 15.96889
## 421 15.89806 14.31743 17.47869
## 442 16.85111 15.11270 18.58951
## 447 16.00273 14.33357 17.67189
## 448 16.39949 14.85966 17.93931
## 453 15.58639 14.00952 17.16327